First, I transformed the nominal columns into dummy variables.
I applied a stepwise selection method in both directions (using AIC as the criterion) to identify significant predictors. This process highlighted the following predictors: X9, X2, X1, X14, X6 and X7.
I plotted residuals against each predictor and observed non-linear dependencies for predictors X1, X2, and X9.
To further explore predictor selection, I also tried Lasso regularization, which identified a slightly different set of predictors: X1, X2, X6, X8, X9, and color_green.
Both the classical approach and Lasso regularization showed overlap in selected predictors, particularly X1, X2, X9, and X6.
I set up a naive baseline using a linear model with predictors selected from both the classical approach and Lasso regularization. The baseline model achieved RMSE values of 4.304524 and 4.308791, which were quite similar.
To capture the non-linear relationships in the data, I fitted a Generalized Additive Model (GAM) with the following specifications:
Cubic B-splines (the splines::bs default degree) for predictor X1 with 7 knots placed at equal quantiles. Natural splines for predictor X2 with 6 degrees of freedom. A smoothing spline for predictor X9. And linear terms for X14, X6 and X7.
After fitting the GAM, I again plotted residuals against the predictors and observed that X2 and X9 still exhibited some non-linear patterns.
Finally, I performed 10-fold cross-validation to evaluate the model and identify the best-performing configuration.
library(tidyverse)
library(tidymodels)
library(leaps)
library(GGally)
library(splines)
library(broom)
# Fix the RNG seed so CV folds and tuning results are reproducible.
set.seed(39692)
# Training data: 4000 rows, 16 columns (3 character, 13 numeric incl. response y).
df <- read_delim("hw4_train.csv", delim = ",")
Rows: 4000 Columns: 16
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): color, grade, income_class
dbl (13): X1, X2, X3, X6, X7, X8, X9, X11, X12, X13, X14, X15, y
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick overview of variable ranges and types before modeling.
summary(df)
X1 X2 X3 color grade X6
Min. :-1.235 Min. :0.06676 Min. :-6.947 Length:4000 Length:4000 Min. :-4.780
1st Qu.: 3.238 1st Qu.:2.56089 1st Qu.:-5.292 Class :character Class :character 1st Qu.:-1.132
Median : 5.066 Median :4.60417 Median :-4.895 Mode :character Mode :character Median : 2.600
Mean : 5.078 Mean :4.60689 Mean :-4.898 Mean : 2.616
3rd Qu.: 6.878 3rd Qu.:6.68131 3rd Qu.:-4.503 3rd Qu.: 6.262
Max. :11.525 Max. :9.24549 Max. :-2.907 Max. :10.141
X7 X8 X9 income_class X11 X12
Min. :0.6348 Min. :-1.077 Min. :-0.8687 Length:4000 Min. :-9.767 Min. :-4.5661
1st Qu.:3.0898 1st Qu.: 3.073 1st Qu.: 2.0285 Class :character 1st Qu.:-6.560 1st Qu.:-1.5610
Median :4.0227 Median : 4.041 Median : 3.2078 Mode :character Median :-5.378 Median :-0.9069
Mean :4.0527 Mean : 4.046 Mean : 3.1790 Mean :-5.392 Mean :-0.8953
3rd Qu.:5.0420 3rd Qu.: 4.985 3rd Qu.: 4.3443 3rd Qu.:-4.217 3rd Qu.:-0.2214
Max. :7.8376 Max. : 8.852 Max. : 7.4246 Max. :-1.038 Max. : 2.7490
X13 X14 X15 y
Min. :-4.6469 Min. :-8.7676 Min. :-1.943 Min. :-16.087
1st Qu.:-1.4875 1st Qu.:-4.0541 1st Qu.: 1.547 1st Qu.: -1.569
Median :-0.6378 Median :-0.4222 Median : 2.914 Median : 5.543
Mean :-0.6260 Mean :-0.2718 Mean : 2.900 Mean : 5.773
3rd Qu.: 0.2467 3rd Qu.: 3.5763 3rd Qu.: 4.250 3rd Qu.: 13.482
Max. : 3.7954 Max. : 7.9442 Max. : 7.399 Max. : 25.434
# Dummy-encode the nominal predictors and bake the recipe on the training data.
rec <- recipe(y ~ ., data = df) |>
  step_dummy(all_nominal_predictors())
df_baked <- rec |>
  prep(df) |>
  bake(new_data = NULL)
head(df_baked)

# Stepwise selection in both directions between the intercept-only and the
# full model, using the default AIC criterion (k = 2).
null_model <- lm(y ~ 1, data = df_baked)
full_model <- lm(y ~ ., data = df_baked)
stepwise_model <- stats::step(
  null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "both",
  trace = 0
  # k = log(nrow(df_baked))  # k = log(n) is sometimes referred to as BIC or SBC.
)
summary(stepwise_model)
Call:
lm(formula = y ~ X9 + X2 + X1 + X14 + X6 + X7, data = df_baked)
Residuals:
Min 1Q Median 3Q Max
-12.0257 -3.1227 -0.0377 3.1384 12.8373
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -21.29286 0.42101 -50.576 < 2e-16 ***
X9 4.13254 0.05951 69.438 < 2e-16 ***
X2 2.25311 0.03075 73.266 < 2e-16 ***
X1 0.63155 0.03406 18.542 < 2e-16 ***
X14 0.07304 0.02396 3.049 0.00231 **
X6 -0.04299 0.01966 -2.186 0.02884 *
X7 0.11708 0.06562 1.784 0.07449 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.302 on 3993 degrees of freedom
Multiple R-squared: 0.8138, Adjusted R-squared: 0.8135
F-statistic: 2908 on 6 and 3993 DF, p-value: < 2.2e-16
# QQ plot of the stepwise-model residuals (base graphics).
resid_vec <- residuals(stepwise_model)
qqnorm(resid_vec, main = "QQ-Plot of Residuals")
qqline(resid_vec, col = "red")
# Equivalent ggplot2 version for a more refined figure.
data.frame(residuals = resid_vec) |>
  ggplot(aes(sample = residuals)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  ggtitle("QQ-Plot of Residuals") +
  theme_minimal()
# Plot residuals of a fitted lm against each numeric predictor in its formula.
#
# m: a fitted lm; ggplot(m) fortifies it, exposing a `.resid` column.
# Side effect: prints one residuals-vs-predictor scatter + loess smooth per
# numeric predictor. Returns nothing useful.
#
# NOTE(review): relies on the global `df_baked` to decide which selected
# variables are numeric — confirm `m` was fitted on `df_baked`.
plot_residuals_fn <- function(m) {
  # Right-hand-side variables of the model formula (drop the response).
  selected_vars <- all.vars(formula(m))[-1]
  # vapply over sapply: guarantees a logical vector even for edge cases.
  continuous_vars <- selected_vars[
    vapply(df_baked[selected_vars], is.numeric, logical(1))
  ]
  for (var in continuous_vars) {
    # aes_string() is deprecated in ggplot2; use the .data pronoun instead
    # to map a column given as a string.
    p <- ggplot(m, aes(x = .data[[var]], y = .data$.resid)) +
      geom_point() +
      geom_smooth(method = "loess", color = "blue") +
      labs(title = paste("Residuals vs", var), x = var, y = "Residuals")
    print(p)
  }
}
# Residual-vs-predictor plots for the stepwise model; X1, X2 and X9 show
# non-linear patterns (motivates the GAM below).
plot_residuals_fn(stepwise_model)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
# 10-fold CV folds on the full baked data for tuning the Lasso penalty.
cv_data <- vfold_cv(df_baked, v = 10)
rec_reg_1 <- recipe(y ~ ., data = df_baked)
# Lasso (mixture = 1) with the penalty tuned over a small grid.
wf_reg_1 <- workflow() |>
  add_model(linear_reg(penalty = tune(), mixture = 1) |> set_engine("glmnet")) |>
  add_recipe(rec_reg_1)
res_reg_1 <- tune_grid(
  wf_reg_1,
  cv_data,
  grid = data.frame(penalty = seq(0.02, 0.2, by = 0.02))
)
# Name the metric explicitly — omitting it triggers a
# 'No value of `metric` was given; "rmse" will be used.' warning.
show_best(res_reg_1, metric = "rmse")
Warning in show_best(res_reg_1) :
No value of `metric` was given; "rmse" will be used.
# Finalize the workflow with the best penalty (by RMSE) and refit on all data.
best_penalty <- select_best(res_reg_1, metric = "rmse")
wf_reg_1_final <- finalize_workflow(wf_reg_1, best_penalty) |>
  fit(df_baked)
wf_reg_1_final
══ Workflow [trained] ══════════════════════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────────────────────────────────────────────────
0 Recipe Steps
── Model ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "gaussian", alpha = ~1)
Df %Dev Lambda
1 0 0.00 6.7100
2 1 7.71 6.1140
3 2 15.01 5.5710
4 2 25.98 5.0760
5 2 35.08 4.6250
6 2 42.63 4.2140
7 2 48.91 3.8400
8 2 54.11 3.4990
9 2 58.44 3.1880
10 3 62.15 2.9050
11 3 65.40 2.6470
12 3 68.10 2.4120
13 3 70.35 2.1970
14 3 72.21 2.0020
15 3 73.76 1.8240
16 3 75.04 1.6620
17 3 76.11 1.5150
18 3 76.99 1.3800
19 3 77.73 1.2570
20 3 78.34 1.1460
21 3 78.84 1.0440
22 3 79.26 0.9512
23 3 79.61 0.8667
24 3 79.90 0.7897
25 3 80.14 0.7195
26 3 80.34 0.6556
27 3 80.51 0.5974
28 3 80.65 0.5443
29 3 80.76 0.4959
30 3 80.86 0.4519
31 3 80.93 0.4117
32 3 81.00 0.3752
33 3 81.05 0.3418
34 3 81.10 0.3115
35 3 81.14 0.2838
36 3 81.17 0.2586
37 3 81.19 0.2356
38 3 81.22 0.2147
39 3 81.23 0.1956
40 3 81.25 0.1782
41 3 81.26 0.1624
42 3 81.27 0.1480
43 3 81.28 0.1348
44 3 81.29 0.1228
45 3 81.29 0.1119
46 3 81.30 0.1020
...
and 27 more lines.
# Refit the Lasso at a fixed penalty and read off the surviving predictors.
wf_reg_1_select <- workflow() |>
  add_recipe(rec_reg_1) |>
  add_model(linear_reg(engine = "glmnet", mixture = 1, penalty = 0.08))
wf_reg_1_select_fitted <- fit(wf_reg_1_select, df_baked)
# Terms with non-zero coefficients; slice(-1) drops the intercept row.
selected_vars <- wf_reg_1_select_fitted |>
  tidy() |>
  filter(estimate != 0) |>
  slice(-1) |>
  pull(term)
selected_vars
[1] "X1" "X2" "X6" "X8" "X9" "color_green"
# Fresh seed just before the train/test split so it is reproducible on its own.
set.seed(20240926)
splitted <- initial_split(df_baked, prop = 0.8)
train_df <- training(splitted)
test_df <- testing(splitted)
cv_data <- vfold_cv(train_df, v = 10)

# Baseline linear model using the stepwise-selected predictors.
wf_baseline_1 <- workflow() |>
  add_recipe(recipe(y ~ X1 + X2 + X9 + X14 + X6 + X7, data = train_df)) |>
  add_model(linear_reg())
res_baseline_1 <- fit_resamples(wf_baseline_1, cv_data, metrics = metric_set(rmse))
collect_metrics(res_baseline_1)
# Refit on the full training split and score once on the held-out test set.
last_fit_result <- last_fit(wf_baseline_1, splitted)
last_fit_result |> collect_metrics()
# Try to fit a model based on the predictors from Lasso regularization.
head(df_baked)
# Baseline linear model using the Lasso-selected predictors:
# X1, X2, X6, X8, X9, color_green.
wf_baseline_2 <- workflow() |>
  add_recipe(recipe(y ~ X1 + X2 + X9 + X6 + X8 + color_green, data = train_df)) |>
  add_model(linear_reg())
res_baseline_2 <- fit_resamples(wf_baseline_2, cv_data, metrics = metric_set(rmse))
collect_metrics(res_baseline_2)
last_fit_result <- last_fit(wf_baseline_2, splitted)
last_fit_result |> collect_metrics()
# GAM on the full baked data: B-splines for X1 (knots at the X1 quantiles
# printed below), natural splines for X2, a smoothing spline for X9, and
# linear terms for X7, X6, X14.
m_gam <- gen_additive_mod(mode = "regression") |>
  fit(
    y ~ splines::bs(X1, knots = c(1.776703, 3.097543, 4.128191, 5.065609,
                                  6.022864, 7.034800, 8.375370)) +
      splines::ns(X2, df = 6) + s(X9) + X7 + X6 + X14,
    data = df_baked
  )
# Attach fitted values/residuals to the data for the diagnostic plots.
m_gam_aug <- augment(m_gam, new_data = df_baked)
# The hard-coded knots above are these 7 equally spaced quantiles of X1
# (10% .. 90%).
quantile(df_baked$X1, probs = seq(0.1, 0.9, length.out = 7))
10% 23.33333% 36.66667% 50% 63.33333% 76.66667% 90%
1.776703 3.097543 4.128191 5.065609 6.022864 7.034800 8.375370
# GAM residuals vs X1.
ggplot(m_gam_aug, aes(x = X1, y = .resid)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# GAM residuals vs X2.
ggplot(m_gam_aug, aes(x = X2, y = .resid)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# GAM residuals vs X9.
ggplot(m_gam_aug, aes(x = X9, y = .resid)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# GAM residuals vs X14.
ggplot(m_gam_aug, aes(x = X14, y = .resid)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# GAM residuals vs X6.
ggplot(m_gam_aug, aes(x = X6, y = .resid)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# GAM residuals vs X7.
ggplot(m_gam_aug, aes(x = X7, y = .resid)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# GAM workflow for cross-validation: the recipe passes the data through
# unchanged; the spline transformations live in the model formula.
wf_gam_1 <- workflow() |>
  add_recipe(recipe(y ~ ., data = train_df)) |>
  add_model(
    gen_additive_mod(mode = "regression"),
    formula = y ~ splines::bs(X1, knots = c(1.776703, 3.097543, 4.128191,
                                            5.065609, 6.022864, 7.034800,
                                            8.375370)) +
      splines::ns(X2, df = 6) + s(X9) + X7 + X6 + X14
  )
res_gam_1 <- fit_resamples(wf_gam_1, cv_data, metrics = metric_set(rmse))
→ A | warning: some 'x' values beyond boundary knots may cause ill-conditioned bases
There were issues with some computations A: x1
There were issues with some computations A: x2
There were issues with some computations A: x2
# CV performance of the GAM, then one refit on the full training split
# and a single evaluation on the held-out test set.
collect_metrics(res_gam_1)
wf_gam_1_result=last_fit(wf_gam_1,splitted)
wf_gam_1_result|>collect_metrics()
# Second GAM variant: same spec as wf_gam_1 except the smooth term is s(X3).
# NOTE(review): this drops X9 entirely even though X9 was the strongest
# stepwise predictor and its residual plot still looked non-linear —
# confirm s(X3) is intentional and not a typo for s(X9).
wf_gam_2 = workflow()|>
add_recipe(recipe(y~.,data=train_df))|>
add_model(
gen_additive_mod(mode="regression"),
formula=y ~ splines::bs(X1, knots = c(1.776703, 3.097543, 4.128191, 5.065609, 6.022864, 7.034800, 8.375370)) + splines::ns(X2, df = 6) + s(X3) + X7 + X6 + X14
)
res_gam_2=fit_resamples(wf_gam_2, cv_data,metrics = metric_set(rmse))
→ A | warning: some 'x' values beyond boundary knots may cause ill-conditioned bases
There were issues with some computations A: x1
There were issues with some computations A: x2
There were issues with some computations A: x2
# CV and test-set performance of the second GAM variant, for comparison
# against res_gam_1 / wf_gam_1_result.
collect_metrics(res_gam_2)
wf_gam_2_result=last_fit(wf_gam_2,splitted)
wf_gam_2_result|>collect_metrics()